## Data used here is sourced from the Memorial database, which is a database of victims of political repression in the Soviet Union.
## The data is available at https://lists.memo.ru and https://github.com/nextgis/memorial_data

## We leverage the following tables included in the data package:
### nationality.csv @ List of individuals with government-designated nationalities
### person.csv @ List of individuals with names and patronymics
### person_data.csv @ List of individuals with additional data, such as unique identifiers, names, patronymics, gender

library(tidyverse)


## Read in and process nationality table
##
## Reads the semicolon-delimited nationality lookup and adds a numeric
## `ethnicity` group code derived from the free-text `Description` column.
## Rules are tried top to bottom and the first matching rule wins, so earlier
## groups take precedence. Descriptions that match no rule (or are missing)
## get group 0.
##
## Patterns are plain stems: str_detect() already finds a stem anywhere in the
## string, so no trailing wildcard is needed. A trailing `*` in a regular
## expression only makes the preceding character repeatable/optional, which
## would let e.g. "лаз*" match "латыш".
nationality <- read_delim("source_data/nationality.csv", delim = ";") %>%
  mutate(
    ethnicity = case_when(
      str_detect(Description, "русск|великор") ~ 1, ## Russian
      str_detect(Description, "украин|малорос") ~ 2, ## Ukrainian
      str_detect(Description, "татар") ~ 3, ## Tatar
      str_detect(Description, "поляк|поль") ~ 4, ## Pole
      str_detect(Description, "немк|немец|пруса") ~ 5, ## German
      str_detect(Description, "евре") ~ 6, ## Jew
      str_detect(Description, "грузин|аджар|мингр|мегрел|сван|имеретин|лаз") ~ 7, ## Georgian
      str_detect(Description, "армян") ~ 8, ## Armenian
      str_detect(Description, "азербайдж") ~ 9, ## Azerbaijani
      str_detect(Description, "казаш|казах") ~ 10, ## Kazakh
      str_detect(Description, "белорус|беларус") ~ 11, ## Belarusian
      str_detect(Description, "калмы|ойрот|сарт-калмык") ~ 12, ## Kalmyk
      str_detect(Description, "буря") ~ 13, ## Buriat
      str_detect(Description, "кирги|каракал|кашгар|таджи|туркмен|узбе") ~ 14, ## Other central asian
      str_detect(Description, "латг|латыш|литов|ингерм|эстон|фин|карел|саам") ~ 15, ## Balts
      ## "тат( |$)" matches the ethnonym "тат" followed by a space or the end of
      ## the description.
      str_detect(Description, "абаз|абхаз|авар|агул|лачка|адыг|балкар|кумык|дагест|даргин|ингуш|кабард|карач|лак|лезги|осет|табаса|талы|тат( |$)|месхе|удин|черке|чечен|шапсу") ~ 16, ## Other Caucasus
      str_detect(Description, "башки|караим|крымч|нога|турок|турч|тюр|уйгур|хазар|гагау") ~ 17, ## Other turkic peoples, outside siberia
      str_detect(Description, "вепс|вотя|зыря|ижем|ижор|коми|лопар|манси|мари|меще|мокш|мордв|остя|удмур|хант|череми|чува|эрз|эрьз|нен|пермя") ~ 18, ## Other finno-ugric peoples
      str_detect(Description, "тувы|айн|алеу|алта|аче|гиля|даур|долга|итель|кет|коря|нана|нгана|негида|нивх|оро|саха|сельк|таз|телен|телеу|тунг|удэ|ульч|хакас|чукч|шор|эвен|юка|яку") ~ 19, ## Siberian nationalities
      TRUE ~ 0 ## Unclassified
    )
  ) %>%
  ## Renaming assumes the file holds exactly two columns (id and description)
  ## before `ethnicity` is added — TODO confirm against the data package.
  setNames(c("nationalityid", "description", "ethnicity"))

## Read in and process person_data table; merge with the nationality data
##
## The file is read without a header row and the literal string \N is treated
## as missing. Columns are named positionally, then each record is matched to
## its nationality row (and ethnicity code) via `nationalityid`.
person_data <- left_join(
  setNames(
    read_delim("person_data.csv", col_names = FALSE, na = "\\N"),
    c(
      "persondataid", "personid", "birthplaceid", "citizenshipid",
      "nationalityid", "educationid", "partyid", "socialoriginid"
    )
  ),
  nationality,
  by = "nationalityid"
)

## Read in and process person table
##
## The file is read without a header row and the literal string \N is treated
## as missing; columns are named positionally.
person <- setNames(
  read_delim("person.csv", col_names = FALSE, na = "\\N"),
  c(
    "personid", "surname", "firstname", "patronymic", "gender",
    "birthyearmin", "firstrepressionyearmin", "viewinfoname"
  )
)

# Merge person with person data
#
# Attaches the person_data attributes (including the ethnicity code) to every
# person record, trims whitespace from `surname`, and derives `last_name` by
# rewriting feminine surname endings to their masculine form:
#   -ва -> -в, -на -> -н, -ая -> -ий
# The lookahead (?=-|$) restricts each rewrite to the end of the surname or
# the end of a hyphen-separated part, and leaves the hyphen itself in place,
# so every part of a double-barrelled surname is normalised
# (e.g. "Иванова-Петрова" -> "Иванов-Петров").
# NOTE(review): the aggregation step further down groups by `surname`, not by
# `last_name` — confirm which key is intended.
last_name_proc <- person %>%
  left_join(person_data, by = "personid") %>%
  mutate(
    surname = str_trim(surname),
    # A named vector applies the replacements one after another.
    last_name = str_replace_all(
      surname,
      c(
        "ва(?=-|$)" = "в",
        "на(?=-|$)" = "н",
        "ая(?=-|$)" = "ий"
      )
    )
  )

# Calculate probabilities
#
# For every surname, count the records falling into each ethnicity group and
# turn the counts into within-surname shares (`probability`). Records without
# an ethnicity code are left out. Rows end up ordered from the most to the
# least frequent surname/ethnicity pair.
# NOTE(review): grouping uses the raw `surname`; the derived `last_name`
# column is not used here — confirm that is intended.
last_names <- last_name_proc %>%
  filter(!is.na(ethnicity)) %>%
  count(surname, ethnicity) %>%
  group_by(surname) %>%
  mutate(probability = n / sum(n, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(n))

# Reshape to one row per surname with one probability column per ethnicity
# code (named ethn_<code>). Missing probabilities are set to 0, and
# surname/ethnicity combinations that never occur are filled with 0.
last_names_prob <- last_names %>%
  dplyr::select(surname, ethnicity, probability) %>%
  mutate(probability = coalesce(probability, 0)) %>%
  pivot_wider(
    id_cols = "surname",
    names_from = "ethnicity",
    values_from = "probability",
    values_fill = 0,
    names_prefix = "ethn_"
  )

# Write processed data into a separate file
#
# Saves the wide surname-by-ethnicity probability table as CSV in the working
# directory.
last_names_prob %>%
  write_csv("last_names.csv")